library(ggplot2)
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(splines)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
data(Wage)
goal)다른 변수들을 이용해서 wage를 예측하는 것을 목표로 한다.
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
summary(Wage)
## year age maritl race
## Min. :2003 Min. :18.00 1. Never Married: 648 1. White:2480
## 1st Qu.:2004 1st Qu.:33.75 2. Married :2074 2. Black: 293
## Median :2006 Median :42.00 3. Widowed : 19 3. Asian: 190
## Mean :2006 Mean :42.41 4. Divorced : 204 4. Other: 37
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## education region jobclass
## 1. < HS Grad :268 2. Middle Atlantic :3000 1. Industrial :1544
## 2. HS Grad :971 1. New England : 0 2. Information:1456
## 3. Some College :650 3. East North Central: 0
## 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## health health_ins logwage wage
## 1. <=Good : 858 1. Yes:2083 Min. :3.000 Min. : 20.09
## 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447 1st Qu.: 85.38
## Median :4.653 Median :104.92
## Mean :4.654 Mean :111.70
## 3rd Qu.:4.857 3rd Qu.:128.68
## Max. :5.763 Max. :318.34
##
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
observations : 3000
variables : 11
1.year :임금 정보가 기록된 연도
2.age :노동자의 나이
3.maritl :{1:결혼안함, 2:기혼, 3:사별, 4:이혼, 5:별거} -> 노동자의
결혼 상태
4.race :{1:백인, 2:흑인, 3:아시안, 4:기타} -> 노동자의 인종
5.education :{1:고졸미만, 2:고졸, 3:일부 대학, 4:대학 졸업, 5:대졸 이상}
-> 교육 수준
6.region :지역
7.jobclass :{1:산업, 2:정보} -> 직업 분류
8.health :{1:좋음 이하, 2:매우 좋음 이상} -> 노동자의 건강 상태
9.health_ins :{1:있음, 2:없음} -> 건강 보험 여부
10.logwage : 로그 스케일 임금
11.wage : 임금
해당 데이터셋에서 wage를 반응 변수로 선정했고, 나머지 변수들을
예측변수의 후보로 선정하였습니다.
[선형성, 등분산성, 정규성]에 대한 검증을 진행하였습니다.
# Baseline multiple linear regression for wage.
# `region` is excluded because every observation is "2. Middle Atlantic"
# (see the summary above: all other regions have count 0), so it carries no
# information; `logwage` is excluded because it is a transform of the
# response itself.
wage_model = lm(wage ~ year + age + maritl + race + education + jobclass + health + health_ins, data = Wage)
summary(wage_model)
##
## Call:
## lm(formula = wage ~ year + age + maritl + race + education +
## jobclass + health + health_ins, data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.33 -18.70 -3.26 13.29 212.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.423e+03 6.165e+02 -3.931 8.67e-05 ***
## year 1.241e+00 3.074e-01 4.037 5.54e-05 ***
## age 2.707e-01 6.223e-02 4.350 1.41e-05 ***
## maritl2. Married 1.718e+01 1.720e+00 9.985 < 2e-16 ***
## maritl3. Widowed 2.052e+00 8.005e+00 0.256 0.79774
## maritl4. Divorced 3.967e+00 2.887e+00 1.374 0.16951
## maritl5. Separated 1.153e+01 4.844e+00 2.380 0.01736 *
## race2. Black -5.096e+00 2.146e+00 -2.375 0.01760 *
## race3. Asian -2.814e+00 2.603e+00 -1.081 0.27978
## race4. Other -6.059e+00 5.666e+00 -1.069 0.28505
## education2. HS Grad 7.759e+00 2.369e+00 3.275 0.00107 **
## education3. Some College 1.834e+01 2.520e+00 7.278 4.32e-13 ***
## education4. College Grad 3.124e+01 2.548e+00 12.259 < 2e-16 ***
## education5. Advanced Degree 5.395e+01 2.811e+00 19.190 < 2e-16 ***
## jobclass2. Information 3.571e+00 1.324e+00 2.697 0.00704 **
## health2. >=Very Good 6.515e+00 1.421e+00 4.585 4.72e-06 ***
## health_ins2. No -1.751e+01 1.403e+00 -12.479 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34 on 2983 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.3361
## F-statistic: 95.89 on 16 and 2983 DF, p-value: < 2.2e-16
# Linearity check: if the model captures the mean structure, points should
# lie close to the y = x reference line.
plot(wage_model$fitted.values, Wage$wage,
     xlab = "Fitted values",
     ylab = "Actual values",
     main = "Linearity: Fitted vs Actual")
# Reference line y = x. Previously abline() was passed as an argument to
# plot(), which only draws by accident of lazy evaluation; it belongs after
# the plot call. ("Linerity" typo in the title also fixed.)
abline(0, 1, col = "red")
# Homoscedasticity check: residuals should scatter evenly around zero with
# no visible pattern across the fitted values.
fit_wage <- wage_model$fitted.values
res_wage <- residuals(wage_model)
plot(fit_wage, res_wage,
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted Values")
abline(h = 0, col = "red")  # zero-residual reference line
# Normality check: car::qqPlot draws a Q-Q plot of the model residuals and
# reports the labels of the most extreme observations (printed below).
qqPlot(wage_model, main = "Q-Q Plot of Residuals")
## 154582 233833
## 504 2713
2.Homoscedasticity(등분산성) residual plot을 통해서 확인해 본 결과, 잔차에서 뚜렷한 패턴은 보이지 않았습니다. 다만, Q-Q plot에서는 잔차가 정규분포에서 벗어나는 모습이 관찰되었습니다.
결론) 잔차의 비정규성을 근거로, 해당 데이터에는 선형모델이 적합하지 않다고 결론을 내립니다.
# 1. correlation
# Candidate predictors plus the response. Every factor column is recoded to
# a 0-based numeric code so it can enter the correlation matrix; the six
# hand-written `as.numeric(...) - 1` lines are replaced by one across() call
# (identical result, no copy-paste).
num_vars <- Wage %>%
  select(year, age, maritl, race, education, jobclass, health, health_ins, wage) %>%
  mutate(across(where(is.factor), ~ as.numeric(.x) - 1))
cor_matrix <- cor(num_vars, use = "complete.obs") # remove missing values
corrplot(cor_matrix, method = "number")
wage와 다른 변수들간의 상관관계를 확인한 결과, education이 0.48로 가장 높은 수치를 보였다.
set.seed(123)          # reproducible partition
Wage <- na.omit(Wage)  # drop rows with missing values
# train : test = 0.8 : 0.2
trainIndex <- createDataPartition(Wage$wage, p = 0.8, list = FALSE)
trainset_wage <- Wage[trainIndex, ]
testset_wage  <- Wage[-trainIndex, ]
ctrl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
max_degree <- 10
cv_errors <- rep(NA_real_, max_degree)
# CV MSE of wage ~ poly(education, d) for d = 1..10. The ten copy-pasted
# train() calls are replaced by a loop; the sequence of train() calls (and
# hence the RNG-driven fold assignments) is unchanged.
# NOTE(review): education is a factor; poly() expects numeric input —
# confirm this ran as intended (it may need as.numeric(education)).
for (d in seq_len(max_degree)) {
  model <- train(wage ~ poly(education, d, raw = TRUE),
                 data = trainset_wage, method = "lm", trControl = ctrl)
  cv_errors[d] <- mean(model$results$RMSE^2)
}
# Visualize the cross-validation error against the polynomial degree
plot(seq_len(max_degree), cv_errors, type = "b",
     xlab = "Degree of Polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs. Polynomial Degree")
ctrl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
max_degree <- 10
cv_errors <- rep(NA_real_, max_degree)
# CV MSE of wage ~ poly(age, d) for d = 1..10; the ten copy-pasted train()
# calls are collapsed into a loop (same sequence of calls, same results).
for (d in seq_len(max_degree)) {
  model <- train(wage ~ poly(age, d, raw = TRUE),
                 data = trainset_wage, method = "lm", trControl = ctrl)
  cv_errors[d] <- mean(model$results$RMSE^2)
}
plot(seq_len(max_degree), cv_errors, type = "b",
     xlab = "Degree of Polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs. Degree of polynomial")
10-fold cross-validation을 wage ~ education, wage ~ age에 대해서 각각 진행한 결과, age를 예측변수로 사용한 쪽이 MSE가 훨씬 안정적으로 수렴하고 있다는 것을 확인할 수 있었다. 따라서, wage를 예측하는 데 있어서 age가 더 적절하다는 판단을 내린다.
max_degree <- 10
cv_errors_sp <- rep(NA_real_, max_degree)
# CV MSE of a natural-spline fit wage ~ ns(age, df) for df = 1..10
# (reuses `ctrl`, the 10-fold CV control defined above). The ten
# copy-pasted train() calls are collapsed into a loop, and the "splne"
# typo in the plot title is corrected.
for (df_ in seq_len(max_degree)) {
  model_spline <- train(wage ~ ns(age, df = df_),
                        data = trainset_wage, method = "lm", trControl = ctrl)
  cv_errors_sp[df_] <- mean(model_spline$results$RMSE^2)
}
plot(seq_len(max_degree), cv_errors_sp, type = "b",
     xlab = "Degree of natural spline", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of natural spline")
# Final comparison on the held-out test set: plain linear, quadratic
# polynomial, and 2-df natural spline fits of wage on age.
lm_model      <- lm(wage ~ age, data = trainset_wage)
poly_model_d2 <- lm(wage ~ poly(age, 2, raw = TRUE), data = trainset_wage)
ns_model_d2   <- lm(wage ~ ns(age, df = 2), data = trainset_wage)

# Predictions on the test set
poly_predict <- predict(poly_model_d2, newdata = testset_wage)
ns_predict   <- predict(ns_model_d2, newdata = testset_wage)
lm_predict   <- predict(lm_model, newdata = testset_wage)

# Test-set mean squared error of each model
test_mse <- function(pred) mean((testset_wage$wage - pred)^2)
MSE_poly <- test_mse(poly_predict)
MSE_ns   <- test_mse(ns_predict)
MSE_lm   <- test_mse(lm_predict)
print(MSE_poly)
## [1] 1694.414
print(MSE_ns)
## [1] 1700.754
print(MSE_lm)
## [1] 1745.15
Polynomial model
degree : 2 MSE : 1694.414
Natural spline
degree : 2 MSE : 1700.754
Linear model MSE : 1745.15
해당 wage 변수를 추정할 때, linear model보다 non-linear model이 더 좋은 성능을 보인다는 사실을 확인했다. 또한, 근소한 차이로 polynomial model이 Natural spline 모델보다 더 높은 성능을 보였다.
data(Auto)
goal)다른 변수들을 이용해서 mpg(연비)를 예측하는 것을 목표로 한다.
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
observations : 392
variables : 9
1.mpg : 연비(마일 당 갤런)
2.cylinders : 실린더 수
3.displacement: 엔진 배기량
4.horsepower: 엔진의 마력
5.weight: 차의 무게
6.acceleration: 0에서 60마일에 도달하는 시간 (초 단위)
7.year: 자동차 제조 연도
8.origin: {1:미국, 2:유럽, 3:일본} -> 자동차의 제조국
9.name: 자동차 모델 이름
자동차의 제조사 및 소비자 입장에서 mpg가 가장 주요한 변수 중 하나라고 생각했기 때문에, 해당 변수를 반응변수로 선택하였습니다.
# Baseline multiple linear regression: mpg on every other variable except
# `name` (a 304-level factor of model names — one dummy per car model would
# add no generalizable information).
auto_model = lm(mpg ~ . -name, data = Auto)
summary(auto_model)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# Linearity check: if the model captures the mean structure, points should
# lie close to the y = x reference line.
plot(auto_model$fitted.values, Auto$mpg,
     xlab = "Fitted values",
     ylab = "Actual values",
     main = "Linearity: Fitted vs Actual")
# Reference line y = x. Previously abline() was passed as an argument to
# plot(), which only draws by accident of lazy evaluation; it belongs after
# the plot call. ("Linerity" typo in the title also fixed.)
abline(0, 1, col = "red")
# Homoscedasticity check: residuals should scatter evenly around zero with
# no visible pattern across the fitted values.
fit_auto <- auto_model$fitted.values
res_auto <- residuals(auto_model)
plot(fit_auto, res_auto,
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted Values")
abline(h = 0, col = "red")  # zero-residual reference line
# Normality check: car::qqPlot draws a Q-Q plot of the model residuals and
# reports the labels of the most extreme observations (printed below).
qqPlot(auto_model, main = "Q-Q Plot of Residuals")
## 323 327
## 321 325
1.Linearity(선형성) fitted value와 actual value 사이에 선형적인 관계가 나타난다는 사실을 확인할 수 있었습니다.
2.Homoscedasticity(등분산성) residual plot을 통해서 확인해 본 결과, 잔차에서 2차 함수와 같은 패턴을 확인할 수 있었습니다. 선형 회귀 모델이 데이터를 잘 설명하고 있지 않다고 결론을 내릴 수 있었습니다.
3.Normality(정규성) Q-Q plot을 그려본 결과, residual이 정규 분포를 따르지 않았습니다.
결론)해당 데이터셋에는 linear model이 적합하지 않다는 결론을 내릴 수 있었습니다.
# 1. correlation
# All candidate columns are already numeric; origin (an integer code) is
# shifted to start at 0, exactly as in the original — the shift does not
# affect the correlations.
auto_numeric <- Auto %>%
  select(mpg, cylinders, displacement, horsepower, weight, acceleration, year, origin)
auto_numeric$origin <- as.numeric(auto_numeric$origin) - 1
cor_matrix <- cor(auto_numeric, use = "complete.obs") # remove missing values
corrplot(cor_matrix, method = "number")
mpg는 cylinders, displacement, horsepower, weight과 큰 음의 상관관계를
보이는 것을 확인할 수 있었습니다. 따라서, mpg 예측에 cylinders,
displacement, horsepower, weight를 예측변수의 후보로 사용합니다.
# Distribution of each candidate predictor, one histogram per variable.
# A small helper removes the four near-identical ggplot chains; every title,
# fill, binwidth, and axis label is reproduced exactly as before.
auto_hist <- function(var, title_text) {
  ggplot(Auto, aes(x = .data[[var]])) +
    geom_histogram(binwidth = 5, fill = "blue", color = "black") +
    labs(title = title_text, x = var, y = "count") +
    theme_minimal()
}
auto_hist("cylinders", "Histogram of cylinder in the Auto Dataset")
auto_hist("displacement", "Histogram of displacement in the Auto Dataset")
auto_hist("horsepower", "Histogram of horsepower in the Auto Dataset")
auto_hist("weight", "Histogram of weight in the Auto Dataset")
cylinders는 소수의 이산적인 값만 가지며, displacement는 weight와 상관관계가 매우 높아 중복 정보를 제공하므로, 모델 fitting 시 cylinders와 displacement는 제외하기로 한다.
set.seed(123)          # reproducible partition
Auto <- na.omit(Auto)  # drop rows with missing values
# train : test = 0.8 : 0.2
trainIndex <- createDataPartition(Auto$mpg, p = 0.8, list = FALSE)
trainset_auto <- Auto[trainIndex, ]
testset_auto  <- Auto[-trainIndex, ]
ctrl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
max_degree <- 10
cv_errors_auto <- rep(NA_real_, max_degree)
# CV MSE of mpg ~ poly(horsepower, d) + poly(weight, d) for d = 1..10.
# The ten copy-pasted train() calls are collapsed into a loop; the sequence
# of calls (and the RNG-driven fold assignments) is unchanged.
for (d in seq_len(max_degree)) {
  model <- train(mpg ~ poly(horsepower, d, raw = TRUE) + poly(weight, d, raw = TRUE),
                 data = trainset_auto, method = "lm", trControl = ctrl)
  cv_errors_auto[d] <- mean(model$results$RMSE^2)
}
plot(seq_len(max_degree), cv_errors_auto, type = "b",
     xlab = "Degree of polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of polynomial")
##### Spline model
max_degree <- 10
cv_errors_auto <- rep(NA_real_, max_degree)
# CV MSE of mpg ~ ns(horsepower, df) + ns(weight, df) for df = 1..10
# (reuses `ctrl` from above). The copy-pasted train() calls are collapsed
# into a loop, and the xlab — which wrongly said "Degree of polynomial" on
# this natural-spline plot — is corrected to match the title.
for (df_ in seq_len(max_degree)) {
  model_spline <- train(mpg ~ ns(horsepower, df = df_) + ns(weight, df = df_),
                        data = trainset_auto, method = "lm", trControl = ctrl)
  cv_errors_auto[df_] <- mean(model_spline$results$RMSE^2)
}
plot(seq_len(max_degree), cv_errors_auto, type = "b",
     xlab = "Degree of natural spline", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of natural spline")
polynomial model과 natural spline의 CV MSE를 비교한 결과, 둘다 df가
2일 때부터 유의미한 성능 향상이 나타났다.
따라서, df가 2인 모델을 선정하기로 한다.
# Final comparison on the held-out test set: quadratic polynomial, 2-df
# natural spline, and plain linear fits of mpg on weight + horsepower.
poly_model_auto_d2 <- lm(mpg ~ poly(weight, 2, raw = TRUE) + poly(horsepower, 2, raw = TRUE),
                         data = trainset_auto)
ns_model_auto_d2   <- lm(mpg ~ ns(weight, df = 2) + ns(horsepower, df = 2),
                         data = trainset_auto)
linear_model_auto  <- lm(mpg ~ weight + horsepower, data = trainset_auto)

# Predictions on the test set
poly_predict_auto <- predict(poly_model_auto_d2, newdata = testset_auto)
ns_predict_auto   <- predict(ns_model_auto_d2, newdata = testset_auto)
lm_predict_auto   <- predict(linear_model_auto, newdata = testset_auto)

# Test-set mean squared error of each model
MSE_poly_auto <- mean((testset_auto$mpg - poly_predict_auto)^2)
MSE_ns_auto   <- mean((testset_auto$mpg - ns_predict_auto)^2)
MSE_lm_auto   <- mean((testset_auto$mpg - lm_predict_auto)^2)
print(MSE_poly_auto)
## [1] 13.21094
print(MSE_ns_auto)
## [1] 13.13274
print(MSE_lm_auto)
## [1] 16.47695
Polynomial model
degree : 2 MSE : 13.21094
Natural spline
degree : 2 MSE : 13.13274
Linear model MSE : 16.47695
mpg 변수를 추정할 때, linear model보다 non-linear model이 더 좋은 성능을 보인다는 사실을 알 수 있었습니다. polynomial과 Natural spline의 경우에는 유의미한 성능 차이를 볼 수 없었습니다.
data(College)
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
observations : 777
variables : 18
1.Private: 사립 대학 여부 (Yes/No)
2.Apps: 접수된 지원서 수
3.Accept: 합격한 학생 수
4.Enroll: 등록한 학생 수
5.Top10perc: 상위 10%의 고등학교 성적을 가진 학생 비율
6.Top25perc: 상위 25%의 고등학교 성적을 가진 학생 비율
7.F.Undergrad: 풀타임 학부생 수
8.P.Undergrad: 파트타임 학부생 수
9.outstate: 외지 학생 등록금 Room.Board: 숙소 및 식사비
10.Books: 교재비
11.Personal: 개인 경비
12.PhD: 박사학위 보유한 교수 비율
13.Terminal: 최종 학위(terminal degree)를 보유한 교수 비율
14.S.F Ratio : 학생/교수 비율
15.perc.alumni: 기부한 동문 비율
16.Expend: 학생 당 교육 비용 Grad.Rate: 졸업생 비율
대학 또는 지원자들 입장에서 Apps가 주요한 관심사 중 하나라고 생각했기 때문에, 해당 변수를 반응변수로 설정하였습니다.
# Baseline multiple linear regression: Apps (number of applications, the
# response) regressed on every other variable in the College data.
college_model = lm(Apps ~ ., data = College)
summary(college_model)
##
## Call:
## lm(formula = Apps ~ ., data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4908.8 -430.2 -29.5 322.3 7852.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -445.08413 408.32855 -1.090 0.276053
## PrivateYes -494.14897 137.81191 -3.586 0.000358 ***
## Accept 1.58581 0.04074 38.924 < 2e-16 ***
## Enroll -0.88069 0.18596 -4.736 2.60e-06 ***
## Top10perc 49.92628 5.57824 8.950 < 2e-16 ***
## Top25perc -14.23448 4.47914 -3.178 0.001543 **
## F.Undergrad 0.05739 0.03271 1.754 0.079785 .
## P.Undergrad 0.04445 0.03214 1.383 0.167114
## Outstate -0.08587 0.01906 -4.506 7.64e-06 ***
## Room.Board 0.15103 0.04829 3.127 0.001832 **
## Books 0.02090 0.23841 0.088 0.930175
## Personal 0.03110 0.06308 0.493 0.622060
## PhD -8.67850 4.63814 -1.871 0.061714 .
## Terminal -3.33066 5.09494 -0.654 0.513492
## S.F.Ratio 15.38961 13.00622 1.183 0.237081
## perc.alumni 0.17867 4.10230 0.044 0.965273
## Expend 0.07790 0.01235 6.308 4.79e-10 ***
## Grad.Rate 8.66763 2.94893 2.939 0.003390 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1041 on 759 degrees of freedom
## Multiple R-squared: 0.9292, Adjusted R-squared: 0.9276
## F-statistic: 585.9 on 17 and 759 DF, p-value: < 2.2e-16
# Linearity check: if the model captures the mean structure, points should
# lie close to the y = x reference line.
plot(college_model$fitted.values, College$Apps,
     xlab = "Fitted values",
     ylab = "Actual values",
     main = "Linearity: Fitted vs Actual")
# Reference line y = x. Previously abline() was passed as an argument to
# plot(), which only draws by accident of lazy evaluation; it belongs after
# the plot call. ("Linerity" typo in the title also fixed.)
abline(0, 1, col = "red")
# Homoscedasticity check: residuals should scatter evenly around zero with
# no visible pattern across the fitted values.
fit_college <- college_model$fitted.values
res_college <- residuals(college_model)
plot(fit_college, res_college,
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted Values")
abline(h = 0, col = "red")  # zero-residual reference line
# Normality check: car::qqPlot draws a Q-Q plot of the model residuals and
# reports the labels of the most extreme observations (printed below).
qqPlot(college_model, main = "Q-Q Plot of Residuals")
## Princeton University Rutgers at New Brunswick
## 460 484
1.Linearity(선형성)
fitted value와 actual value 사이에 선형적인 관계가 나타난다는 사실을
확인할 수 있었습니다. 그러나, 데이터가 왼쪽으로 편향되어 있었습니다.
2.Homoscedasticity(등분산성)
residual plot을 확인한 결과, residual의 특정한 패턴은 확인할 수
없었습니다.
3.Normality(정규성)
Q-Q plot을 그려본 결과, residual이 정규분포를 따르지 않는다는 사실을
확인할 수 있었습니다.
결론)
해당 데이터셋에는 선형 모델이 적합하지 않다는 결론을 내릴 수
있었습니다.
# Correlation among the candidate variables; Private (a Yes/No factor) is
# recoded to 0/1 so it can enter the correlation matrix.
college_numeric <- College %>%
  select(Enroll, Outstate, Grad.Rate, PhD, S.F.Ratio, Apps, Accept, Private, Expend)
college_numeric$Private <- as.numeric(college_numeric$Private) - 1
cor_matrix <- cor(college_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "number")
Apps은 Enroll, Accept과의 큰 양의 상관관계를 보이는 것을 확인할 수 있습니다.
# Distribution of each candidate predictor, one histogram per variable.
# BUG FIX: the original copy-pasted the title "Histogram of Enroll in the
# College Dataset" onto the Top10perc, Outstate, and Expend plots, and used
# the x label "Top10perc" on the Outstate plot. Titles and axis labels now
# match the variable actually plotted.
# NOTE(review): binwidth = 5 is very fine for dollar-scale variables such as
# Outstate and Expend — consider a larger binwidth for those.
college_hist <- function(var) {
  ggplot(College, aes(x = .data[[var]])) +
    geom_histogram(binwidth = 5, fill = "blue", color = "black") +
    labs(title = paste("Histogram of", var, "in the College Dataset"),
         x = var, y = "count") +
    theme_minimal()
}
college_hist("Top10perc")
college_hist("Outstate")
college_hist("Expend")
college_hist("Enroll")
college_hist("Accept")
후보 예측변수들 모두 right-skewed distribution을 따르지만, Top10perc가
가장 명확한 right-skewed distribution을 따르고 있다는 사실을 확인할 수
있었다.
따라서, 분포의 형태가 가장 명확한 Top10perc를 Apps의 예측변수로
사용하기로 한다.
set.seed(123)          # reproducible partition
# BUG FIX: the original line read `Auto = na.omit(College)`, which clobbered
# the Auto data frame with College rows and never removed NAs from College
# itself before partitioning.
College <- na.omit(College)
# train : test = 0.8 : 0.2
trainIndex <- createDataPartition(College$Apps, p = 0.8, list = FALSE)
trainset_college <- College[trainIndex, ]
testset_college  <- College[-trainIndex, ]
ctrl = trainControl(method = "cv", number = 10) # 10-fold cross-validation
max_degree = 10
cv_errors_college = rep(NA, max_degree)
# Fit polynomial regressions of Apps on Top10perc for degrees 1..max_degree,
# scoring each by 10-fold CV. RMSE is squared so the curve is on an MSE scale.
# (Replaces ten copy-pasted train() calls with a loop; the calls run in the
# same order after set.seed(123), so the CV fold assignments are identical.)
for (d in seq_len(max_degree)) {
  model <- train(Apps ~ poly(Top10perc, degree = d, raw = TRUE),
                 data = trainset_college, method = "lm", trControl = ctrl)
  cv_errors_college[d] <- mean(model$results$RMSE^2)
}
plot(seq_len(max_degree), cv_errors_college, type = "b",
     xlab = "Degree of polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of polynomial")
# Fit natural-spline regressions of Apps on Top10perc for df 1..7, scoring
# each by 10-fold CV (RMSE squared to report MSE).
# (Replaces seven copy-pasted train() calls with a loop. Also fixes the
# x-axis label, which previously said "Degree of polynomial" — this plot
# varies the spline's degrees of freedom, not a polynomial degree.)
max_degree <- 7
cv_errors_college <- rep(NA_real_, max_degree)
for (k in seq_len(max_degree)) {
  model_spline <- train(Apps ~ ns(Top10perc, df = k),
                        data = trainset_college, method = "lm", trControl = ctrl)
  cv_errors_college[k] <- mean(model_spline$results$RMSE^2)
}
plot(seq_len(max_degree), cv_errors_college, type = "b",
     xlab = "Degrees of freedom of spline", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of spline")
polynomial model과 spline model 둘 다 df가 4일 때, 가장 낮은 MSE를
보인다.
polynomial model 과 spline model의 df = 4 을 기준으로 모델을 fitting
하기로 한다.
# Refit the three candidate models on the training set with the CV-chosen
# flexibility (4 df). NOTE(review): the *_d1 suffix is misleading — these
# models use degree/df 4, not 1 — but the names are kept in case later
# chunks reference them.
poly_model_college_d1 <- lm(Apps ~ poly(Top10perc, 4, raw = TRUE),
                            data = trainset_college)
ns_model_college_d1 <- lm(Apps ~ ns(Top10perc, df = 4),
                          data = trainset_college)
linear_model_college <- lm(Apps ~ Top10perc, data = trainset_college)
# Held-out predictions and test-set MSE, computed per model
poly_predict_college <- predict(poly_model_college_d1, newdata = testset_college)
MSE_poly_college <- mean((testset_college$Apps - poly_predict_college)^2)
ns_predict_college <- predict(ns_model_college_d1, newdata = testset_college)
MSE_ns_college <- mean((testset_college$Apps - ns_predict_college)^2)
lm_predict_college <- predict(linear_model_college, newdata = testset_college)
MSE_lm_college <- mean((testset_college$Apps - lm_predict_college)^2)
print(MSE_poly_college)
## [1] 5944005
print(MSE_ns_college)
## [1] 5970815
print(MSE_lm_college)
## [1] 6385897
Polynomial model
degree : 4, MSE : 5944005
Natural spline
df : 4, MSE : 5970815
Linear model MSE : 6385897
Apps 변수를 추정할 때, linear model보다 non-linear model이 약간 더
좋은 성능을 보이는 것을 확인할 수 있었습니다.
polynomial과 Natural spline의 경우에는 유의미한 성능 차이를 볼 수
없었습니다.